package com.scrapy4j.htmlp;


import com.scrapy4j.htmlp.extract.Attr;
import com.scrapy4j.htmlp.extract.Content;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility for automatically extracting common attributes from an HTML page.
 *
 * <pre>{@code
 *     // extract the main body
 *     String txt = HtmLP.getContent(html).getTxt();         // body without tags
 *     String content = HtmLP.getContent(html).getContent(); // body with tags
 *     // extract the title
 *     String title = HtmLP.getTitle(html, metaTitle);
 *     // extract the author
 *     String author = HtmLP.getAuthor(txt);
 *     // extract the publication time
 *     String time = HtmLP.getTime(html);
 * }</pre>
 */
public class HtmLP {
    /** Year, month and day separated by one to five non-digit characters, e.g. {@code 2017-11-23}. */
    private static final String DATE_REGEX =
            "([1-2][0-9]{3})[^0-9]{1,5}?([0-1]?[0-9])[^0-9]{1,5}?([0-9]{1,2})";

    /** {@link #DATE_REGEX} followed by hour and minute, e.g. {@code 2017-11-23 14:52}. */
    private static final String DATE_MINUTE_REGEX =
            DATE_REGEX + "[^0-9]{1,5}?([0-2]?[0-9])[^0-9]{1,5}?([0-9]{1,2})";

    /** {@link #DATE_MINUTE_REGEX} followed by seconds, e.g. {@code 2017-11-23 14:52:23}. */
    private static final String DATE_SECOND_REGEX =
            DATE_MINUTE_REGEX + "[^0-9]{1,5}?([0-9]{1,2})";

    // Patterns are immutable and thread-safe, so they are compiled once instead of on every call.
    private static final Pattern DATE_PATTERN = Pattern.compile(DATE_REGEX);
    private static final Pattern DATE_MINUTE_PATTERN = Pattern.compile(DATE_MINUTE_REGEX);
    private static final Pattern DATE_SECOND_PATTERN = Pattern.compile(DATE_SECOND_REGEX);

    /** Matches an {@code <h1>}..{@code <h6>} element and captures its inner markup. */
    private static final Pattern HEADING_PATTERN =
            Pattern.compile("<h[1-6]+.*?>([\\s\\S]+?)</h[1-6]+>");

    /** Matches any tag so it can be stripped from heading text. */
    private static final Pattern TAG_PATTERN = Pattern.compile("<.*?>");

    /** A heading must share strictly more than this many characters with the meta title. */
    private static final int MIN_TITLE_SIMILARITY = 4;

    /** Coarse upper bounds (inclusive) for the hour, minute and second text of a date string. */
    private static final int[] CLOCK_FIELD_LIMITS = {24, 60, 60};

    public HtmLP() {}

    /**
     * Placeholder for multi-author extraction; not implemented.
     *
     * @param content page text
     * @return always {@code null}
     */
    public static List<String> extractAuthor(String content) {
        return null;
    }

    /**
     * Extracts the first plausible timestamp from the given text.
     *
     * <p>The text is searched with three patterns in order of decreasing precision
     * (seconds, minutes, date only). Every match of a pattern is tried, not just the
     * first one, so an implausible early candidate does not hide a valid later one.
     * A candidate is accepted when {@link #isValidDate(String, String)} accepts it.
     *
     * @param content text to search; may be {@code null}
     * @return the timestamp formatted as {@code y-M-d H:m:s}, {@code y-M-d H:m} or
     *         {@code y-M-d} using the digits exactly as they appear in the text, or
     *         {@code null} when nothing valid is found
     */
    public static String getTime(String content) {
        if (content == null) {
            return null;
        }

        // 2017-11-23 14:52:23
        String time = firstValidTime(content, DATE_SECOND_PATTERN,
                "%s-%s-%s %s:%s:%s", "yyyy-MM-dd HH:mm:ss");
        if (time != null) {
            return time;
        }

        // 2017-11-23 14:52
        time = firstValidTime(content, DATE_MINUTE_PATTERN,
                "%s-%s-%s %s:%s", "yyyy-MM-dd HH:mm");
        if (time != null) {
            return time;
        }

        // 2017-11-23
        return firstValidTime(content, DATE_PATTERN, "%s-%s-%s", "yyyy-MM-dd");
    }

    /**
     * Returns the first match of {@code pattern} whose captured groups, rendered through
     * {@code template}, form a valid date for {@code format}; {@code null} if none does.
     */
    private static String firstValidTime(String content, Pattern pattern, String template, String format) {
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            Object[] fields = new Object[matcher.groupCount()];
            for (int i = 0; i < fields.length; i++) {
                fields[i] = matcher.group(i + 1);
            }
            String candidate = String.format(template, fields);
            if (isValidDate(candidate, format)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Picks the page title by looking for a heading that resembles the meta title.
     *
     * <p>Every {@code <h1>}..{@code <h6>} element is stripped of nested tags and compared
     * with {@code metaTitle} via {@link #simTitle(String, String)}. The last heading whose
     * score exceeds the threshold replaces the meta title.
     *
     * @param content   HTML of the page; may be {@code null}
     * @param metaTitle title taken from the page metadata; may be {@code null}
     * @return the matching heading text, or {@code metaTitle} when no heading qualifies
     *         or either argument is {@code null}
     */
    public static String getTitle(String content, String metaTitle) {
        if (content == null || metaTitle == null) {
            return metaTitle;
        }

        String title = metaTitle;

        // Line feeds are removed first because '.' in the heading pattern does not match them.
        Matcher matcher = HEADING_PATTERN.matcher(content.replace("\n", ""));
        while (matcher.find()) {
            String heading = TAG_PATTERN.matcher(matcher.group(1)).replaceAll("").trim();
            if (simTitle(heading, metaTitle) > MIN_TITLE_SIMILARITY) {
                title = heading;
            }
        }

        return title;
    }

    /**
     * Extracts the main body of a page by delegating to {@code Content.parse}.
     *
     * @param body HTML of the page
     * @return the result of {@code Content.parse(body)}
     */
    public static Content getContent(String body) {
        return Content.parse(body);
    }

    /**
     * Counts how many characters two titles share, based on containment.
     *
     * <p>If the shorter string occurs inside the longer one, the result is the number of
     * characters of the longer string covered by those occurrences; otherwise it is 0.
     * For example {@code "你好中国"} against {@code "你好中国-新浪"} yields 4.
     *
     * @param source first title; may be {@code null}
     * @param target second title; may be {@code null}
     * @return the number of shared characters, or 0 when either argument is {@code null}
     *         or empty, or when neither string contains the other
     */
    protected static int simTitle(String source, String target) {
        if (source == null || target == null || source.length() == 0 || target.length() == 0) {
            return 0;
        }

        boolean sourceIsShorter = source.length() < target.length();
        String shorter = sourceIsShorter ? source : target;
        String longer = sourceIsShorter ? target : source;

        if (!longer.contains(shorter)) {
            return 0;
        }
        // Characters removed by deleting every occurrence of the shorter string.
        return longer.length() - longer.replace(shorter, "").length();
    }

    /**
     * Extracts the source attribute from the page body by delegating to
     * {@code Attr.parse} with the label {@code "来源"} ("source").
     *
     * @param body page text
     * @return the result of {@code Attr.parse(body, "来源")}
     */
    public static String getSource(String body) {
        return Attr.parse(body, "来源");
    }

    /**
     * Derives the source (site name) from what is left of the meta title once the
     * article title has been removed.
     *
     * <p>The remainder is split on {@code -}, {@code _} and {@code |}; the last segment
     * that is not blank is returned with surrounding whitespace removed.
     *
     * @param metaTitle title taken from the page metadata; may be {@code null}
     * @param title     article title; may be {@code null}
     * @return the derived source, or an empty string when {@code metaTitle} is
     *         {@code null}, when nothing but separators remains, or when the result
     *         would just be the meta title itself
     */
    public static String getSource(String metaTitle, String title) {
        if (metaTitle == null) {
            return "";
        }
        String articleTitle = title == null ? "" : title;

        String remainder = metaTitle.equalsIgnoreCase(articleTitle)
                ? metaTitle
                : metaTitle.replace(articleTitle, "").replaceAll("[【】]", "");

        // Split directly on the separators; keep the last segment that has visible text.
        String source = "";
        for (String segment : remainder.split("[-_|]")) {
            String trimmed = segment.trim();
            if (!trimmed.isEmpty()) {
                source = trimmed;
            }
        }

        return source.equalsIgnoreCase(metaTitle.trim()) ? "" : source;
    }

    /**
     * Extracts the author attribute from the page body by delegating to
     * {@code Attr.parse} with a set of author-like labels and the value 15.
     *
     * @param body page text
     * @return the result of the {@code Attr.parse} call
     */
    public static String getAuthor(String body) {
        // NOTE(review): the third argument presumably limits the extracted length - confirm in Attr.
        return Attr.parse(body, "作者|编辑|小编|Author|本帖最后由|责任编辑|记者|责任人", 15);
    }

    /**
     * Checks that a date string is a real calendar date/time for the given pattern and
     * does not lie in the future.
     *
     * <p>Parsing is strict, so out-of-range fields such as month 13 or hour 24 (for
     * {@code HH}) are rejected instead of being rolled over. "Now" is truncated to the
     * precision of {@code format}, so today's date is not treated as a future date when
     * the pattern has no time fields.
     *
     * @param date   text to validate; may be {@code null}
     * @param format {@link SimpleDateFormat} pattern; may be {@code null}
     * @return {@code true} when the text parses strictly and is not after the current time
     * @throws IllegalArgumentException if {@code format} is not a valid pattern
     */
    public static boolean isValidDate(String date, String format) {
        if (date == null || format == null) {
            return false;
        }

        // A fresh formatter per call: SimpleDateFormat is not thread-safe.
        SimpleDateFormat dateFormat = new SimpleDateFormat(format);
        dateFormat.setLenient(false);
        try {
            Date source = dateFormat.parse(date);

            if (!clockFieldsPlausible(date)) {
                return false;
            }

            Date now = dateFormat.parse(dateFormat.format(new Date()));
            return !source.after(now);
        } catch (ParseException e) {
            return false;
        }
    }

    /**
     * Coarse sanity check on the raw hour, minute and second text found in the second
     * space-separated token of {@code date}. It also covers clock text that the format
     * pattern itself does not parse; exact range checks are left to the strict parser.
     * Missing or non-numeric pieces are skipped rather than treated as errors.
     */
    private static boolean clockFieldsPlausible(String date) {
        if (!date.contains(":")) {
            return true;
        }
        String[] tokens = date.trim().split(" ");
        if (tokens.length < 2) {
            return true;
        }
        String[] fields = tokens[1].split(":");
        for (int i = 0; i < fields.length && i < CLOCK_FIELD_LIMITS.length; i++) {
            try {
                if (Integer.parseInt(fields[i]) > CLOCK_FIELD_LIMITS[i]) {
                    return false;
                }
            } catch (NumberFormatException ignored) {
                // Not a plain integer (e.g. fractional seconds); nothing to range-check here.
            }
        }
        return true;
    }
}
